Project 2: Explore Bikeshare Data

Get Ready

library(plyr)
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:plyr’:

    arrange, count, desc, failwith, id, mutate, rename, summarise, summarize

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(ggpubr)
Loading required package: ggplot2
Loading required package: magrittr

Attaching package: ‘ggpubr’

The following object is masked from ‘package:plyr’:

    mutate
library(tidyr)

Attaching package: ‘tidyr’

The following object is masked from ‘package:magrittr’:

    extract
library(ggplot2)
library(lubridate)
package ‘lubridate’ was built under R version 3.6.2
Attaching package: ‘lubridate’

The following objects are masked from ‘package:dplyr’:

    intersect, setdiff, union

The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union
library(gridExtra)

Attaching package: ‘gridExtra’

The following object is masked from ‘package:dplyr’:

    combine

Load data sets

# Load data sets
ny <- read.csv("new_york_city.csv")
wash <- read.csv("washington.csv")
chi <- read.csv("chicago.csv")

Inspect the New York City data set

# Inspect the New York City data set
head(ny)
dim(ny)
[1] 54770     9
colnames(ny)
[1] "X"             "Start.Time"    "End.Time"      "Trip.Duration" "Start.Station" "End.Station"   "User.Type"    
[8] "Gender"        "Birth.Year"   
str(ny)
'data.frame':   54770 obs. of  9 variables:
 $ X            : int  5688089 4096714 2173887 3945638 6208972 1285652 1675753 1692245 2271331 1558339 ...
 $ Start.Time   : Factor w/ 54568 levels "2017-01-01 00:17:01",..: 45448 32799 17316 31589 49688 10220 13390 13509 18111 12449 ...
 $ End.Time     : Factor w/ 54562 levels "201","2017-01-01 00:30:56",..: 45432 32783 17295 31567 49668 10204 13364 13505 18092 12422 ...
 $ Trip.Duration: int  795 692 1325 703 329 998 478 4038 5132 309 ...
 $ Start.Station: Factor w/ 636 levels "","1 Ave & E 16 St",..: 522 406 10 93 5 521 325 309 151 245 ...
 $ End.Station  : Factor w/ 638 levels "","1 Ave & E 16 St",..: 613 8 362 558 269 107 389 110 151 243 ...
 $ User.Type    : Factor w/ 3 levels "","Customer",..: 3 3 3 3 3 3 3 3 2 3 ...
 $ Gender       : Factor w/ 3 levels "","Female","Male": 3 3 3 2 3 3 3 3 1 3 ...
 $ Birth.Year   : num  1998 1981 1987 1986 1992 ...
summary(ny)
       X                         Start.Time                   End.Time     Trip.Duration                     Start.Station  
 Min.   :     47   2017-05-11 18:26:10:    3   2017-01-03 08:54:10:    2   Min.   :     61.0   Pershing Square North:  592  
 1st Qu.:1712425   2017-01-04 13:58:24:    2   2017-01-04 17:21:55:    2   1st Qu.:    368.0   W 21 St & 6 Ave      :  385  
 Median :3418634   2017-01-09 09:36:01:    2   2017-01-05 17:25:17:    2   Median :    610.0   Broadway & E 22 St   :  383  
 Mean   :3415873   2017-01-21 15:36:56:    2   2017-01-12 08:34:01:    2   Mean   :    903.6   E 17 St & Broadway   :  380  
 3rd Qu.:5123382   2017-01-21 17:49:59:    2   2017-01-12 09:41:54:    2   3rd Qu.:   1051.0   West St & Chambers St:  364  
 Max.   :6816152   2017-01-21 20:08:29:    2   2017-01-12 20:34:42:    2   Max.   :1088634.0   W 20 St & 11 Ave     :  329  
                   (Other)            :54757   (Other)            :54758   NA's   :1           (Other)              :52337  
                End.Station         User.Type        Gender        Birth.Year  
 Pershing Square North:  556             :  119         : 5410   Min.   :1885  
 E 17 St & Broadway   :  445   Customer  : 5558   Female:12159   1st Qu.:1970  
 Broadway & E 22 St   :  427   Subscriber:49093   Male  :37201   Median :1981  
 W 21 St & 6 Ave      :  365                                     Mean   :1978  
 W 20 St & 11 Ave     :  344                                     3rd Qu.:1988  
 W 38 St & 8 Ave      :  338                                     Max.   :2001  
 (Other)              :52295                                     NA's   :5218  

Inspect the Washington D.C. data set

# Inspect the Washington D.C. data set
head(wash)
dim(wash)
[1] 89051     7
colnames(wash)
[1] "X"             "Start.Time"    "End.Time"      "Trip.Duration" "Start.Station" "End.Station"   "User.Type"    
str(wash)
'data.frame':   89051 obs. of  7 variables:
 $ X            : int  1621326 482740 1330037 665458 1481135 1148202 1594275 1601832 574182 327058 ...
 $ Start.Time   : Factor w/ 81223 levels "","2017-01-01 00:11:00",..: 74753 19510 59964 26708 67716 50891 73381 73775 23142 13333 ...
 $ End.Time     : Factor w/ 81217 levels "","2017-01-01 00:14:00",..: 74744 19473 59981 26732 67753 50918 73397 73775 23114 13350 ...
 $ Trip.Duration: num  489 403 637 1827 1549 ...
 $ Start.Station: Factor w/ 478 levels "","10th & E St NW",..: 27 478 66 221 278 84 368 82 71 60 ...
 $ End.Station  : Factor w/ 479 levels "","10th & E St NW",..: 47 219 144 312 315 239 162 376 51 308 ...
 $ User.Type    : Factor w/ 3 levels "","Customer",..: 3 3 3 2 3 3 3 3 3 3 ...
summary(wash)
       X                         Start.Time                   End.Time     Trip.Duration     
 Min.   :      7   2017-02-19 12:19:00:    6   2017-03-09 17:54:00:    7   Min.   :    60.3  
 1st Qu.: 434587   2017-02-20 11:35:00:    6   2017-03-28 18:11:00:    7   1st Qu.:   410.9  
 Median : 872858   2017-02-24 17:46:00:    6   2017-01-13 17:48:00:    6   Median :   707.0  
 Mean   : 873881   2017-03-01 08:20:00:    6   2017-01-31 08:49:00:    6   Mean   :  1234.0  
 3rd Qu.:1313305   2017-03-02 08:39:00:    6   2017-02-13 18:09:00:    6   3rd Qu.:  1233.2  
 Max.   :1751392   2017-03-09 17:31:00:    6   2017-02-20 11:38:00:    6   Max.   :904591.4  
                   (Other)            :89015   (Other)            :89013   NA's   :1         
                              Start.Station                                                  End.Station         User.Type    
 Columbus Circle / Union Station     : 1700   Columbus Circle / Union Station                      : 1767             :    1  
 Lincoln Memorial                    : 1546   Jefferson Dr & 14th St SW                            : 1603   Customer  :23450  
 Jefferson Dr & 14th St SW           : 1488   Lincoln Memorial                                     : 1514   Subscriber:65600  
 Massachusetts Ave & Dupont Circle NW: 1219   Massachusetts Ave & Dupont Circle NW                 : 1344                     
 Jefferson Memorial                  : 1068   Smithsonian-National Mall / Jefferson Dr & 12th St SW: 1103                     
 15th & P St NW                      : 1040   15th & P St NW                                       : 1077                     
 (Other)                             :80990   (Other)                                              :80643                     

Inspect the Chicago data set

# Inspect the Chicago data set
head(chi)
dim(chi)
[1] 8630    9
colnames(chi)
[1] "X"             "Start.Time"    "End.Time"      "Trip.Duration" "Start.Station" "End.Station"   "User.Type"    
[8] "Gender"        "Birth.Year"   
str(chi)
'data.frame':   8630 obs. of  9 variables:
 $ X            : int  1423854 955915 9031 304487 45207 1473887 961916 65924 606841 135470 ...
 $ Start.Time   : Factor w/ 8624 levels "2017-01-01 00:40:14",..: 7876 5303 73 1721 267 8173 5347 368 3376 795 ...
 $ End.Time     : Factor w/ 8625 levels "2017-01-01 00:46:32",..: 7876 5303 73 1722 267 8173 5346 368 3376 796 ...
 $ Trip.Duration: int  321 1610 416 350 534 586 281 723 689 493 ...
 $ Start.Station: Factor w/ 472 levels "2112 W Peterson Ave",..: 468 424 291 80 103 119 22 255 374 420 ...
 $ End.Station  : Factor w/ 471 levels "","2112 W Peterson Ave",..: 132 381 469 409 151 70 467 251 200 118 ...
 $ User.Type    : Factor w/ 3 levels "","Customer",..: 3 3 3 3 3 3 3 2 3 3 ...
 $ Gender       : Factor w/ 3 levels "","Female","Male": 3 2 3 3 3 3 2 1 3 3 ...
 $ Birth.Year   : num  1992 1992 1981 1986 1975 ...
summary(chi)
       X                         Start.Time                  End.Time    Trip.Duration                          Start.Station 
 Min.   :     36   2017-01-24 07:40:32:   2   2017-04-16 13:16:52:   2   Min.   :   60.0   Streeter Dr & Grand Ave     : 210  
 1st Qu.: 386722   2017-04-22 13:16:25:   2   2017-04-26 16:29:26:   2   1st Qu.:  394.2   Lake Shore Dr & Monroe St   : 140  
 Median : 773554   2017-05-27 15:17:50:   2   2017-05-21 16:20:56:   2   Median :  670.0   Clinton St & Washington Blvd: 120  
 Mean   : 776721   2017-06-10 13:29:41:   2   2017-05-27 09:58:21:   2   Mean   :  937.2   Clinton St & Madison St     : 102  
 3rd Qu.:1171266   2017-06-20 17:05:11:   2   2017-06-25 14:51:35:   2   3rd Qu.: 1119.0   Canal St & Adams St         : 101  
 Max.   :1551248   2017-06-21 13:18:52:   2   2017-01-01 00:46:32:   1   Max.   :85408.0   Michigan Ave & Oak St       :  98  
                   (Other)            :8618   (Other)            :8619                     (Other)                     :7859  
                       End.Station        User.Type       Gender       Birth.Year  
 Streeter Dr & Grand Ave     : 233             :   1         :1748   Min.   :1899  
 Clinton St & Madison St     : 145   Customer  :1746   Female:1723   1st Qu.:1975  
 Theater on the Lake         : 131   Subscriber:6883   Male  :5159   Median :1984  
 Lake Shore Dr & Monroe St   : 115                                   Mean   :1981  
 Clinton St & Washington Blvd: 109                                   3rd Qu.:1989  
 Lake Shore Dr & North Blvd  : 102                                   Max.   :2002  
 (Other)                     :7795                                   NA's   :1747  

Preparations (Joining the Data Sets)

Before joining the data sets, I include the variable city for each of the original data sets in order to be able to identify which city the observations belong to later on. In addition, I also want to exclude all observations with missing values.

Build a function that adds the variable city and excludes all observations with missing values

# Build a function that adds the variable city and excludes all observations with missing values
# city_omit(x): returns `x` with an added `city` column and without any row
# that contains an NA.
#   x - a data frame; the text of the expression passed as `x` becomes the
#       city label, so city_omit(ny) labels every row "ny".
city_omit <- function(x) {
  # Capture the caller's argument expression as a string (e.g. "ny")
  city_name <- deparse(substitute(x))
  x%>%
  mutate(city = city_name)%>%  # add the label as a constant column
  na.omit()                    # drop every row that contains an NA
}

Use function to include variable city and exclude all observations with missing values for each city

# Use function to include variable city and exclude all observations with missing values for each city
ny <- city_omit(ny)
wash <- city_omit(wash)
chi <- city_omit(chi)

Check results of the function

# Check results of the function
head(ny)
dim(ny)
[1] 49552    10
head(wash)
dim(wash)
[1] 89050     8
head(chi)
dim(chi)
[1] 6883   10

Use plyr’s rbind.fill() function to join all three data sets (Union)

# Use plyr's rbind.fill() function to join all three data sets (Union)
bikeshare <- rbind.fill(ny, wash, chi)
head(bikeshare)
tail(bikeshare)

Build function to check if the sizes of joined data sets are equal

### Build function to check if sizes of joined data sets are equal
# check(x, y): compares `x` and `y` with `==` and prints a status message.
# NOTE(review): ifelse() returns the value of the print() call it evaluated,
# so a top-level call displays the message twice - once from print() and once
# from auto-printing the returned value.
check <- function(x, y) {
  ifelse(x == y, print("The size of the data sets is equal"), print("Error"))
}

Quickly check if the size of the joined data set is equal to the sum of the individual data sets

# Quickly check if the size of the joined data set is equal to the sum of the individual data sets
x1 <- nrow(bikeshare)
y1 <- nrow(ny) + nrow(wash) + nrow(chi)
check(x1, y1)
[1] "The size of the data sets is equal"
[1] "The size of the data sets is equal"

Add id column to uniquely identify each observation

# Add id column to uniquely identify each observation
bikeshare$id <- seq.int(nrow(bikeshare))
head(bikeshare)
tail(bikeshare)

Question 1

Which city is using the bikeshare service for the longest trip duration on average? In other words, what is the average travel time for users in different cities?

Due to the fact, that the variable Trip.Duration is declared in seconds, create a new variable that transforms Trip.Duration from seconds to minutes

# Due to the fact, that the variable Trip.Duration is declared in seconds, create a new variable that transforms
# Trip.Duration from seconds to minutes
bikeshare$minutes <-  round((bikeshare$Trip.Duration / 60), 2)
head(bikeshare)

Create plot showing the distribution of trip durations in minutes for each of the three cities, as well as their mean and median

# Create plot showing the distribution of trip durations in minutes for each of the three cities, as well as their mean and median
ggplot(aes(x=minutes), data=bikeshare) +
  geom_histogram(binwidth = 0.5, color = "white", fill = "grey26") +
  coord_cartesian(xlim=c(0, quantile(bikeshare$minutes, 0.95))) + # Omit top 5% outliers
  scale_x_continuous(breaks = seq(0, 45, 5)) +
  labs(title="Distribution of Trip Duration (in Minutes) per City", caption = "Median = Blue \n Mean = Red") +
  xlab("Trip Duration in Minutes") + ylab("Frequency") +
  facet_wrap(~city, nrow = 3, scales="free_y", labeller = as_labeller(c(chi = "Chicago", ny = "New York", wash = "Washington"))) +
  stat_central_tendency(type = "mean", color = "red", show.legend = TRUE) +
  stat_central_tendency(type = "median", color = "blue", show.legend = TRUE)

Create a boxplot to see how long the majority of customers use the bikesharing service

# Create a boxplot to see how long the majority of customers uses the bikesharing service
ggplot(aes(x=minutes), data=bikeshare) +
  geom_boxplot() +
  coord_cartesian(xlim=c(0, quantile(bikeshare$minutes, 0.95))) + # Omit top 5% outliers
  xlab("Trip Duration in Minutes") +
  scale_x_continuous(breaks=seq(0, 45, 5)) +
  facet_wrap(~city, nrow = 3, labeller = as_labeller(c(chi = "Chicago", ny = "New York", wash = "Washington")))

Calculate summary statistics

# Calculate summary statistcs
bikeshare_summary <- bikeshare%>%
  group_by(city)%>%
  summarise(mean = mean(minutes),
            median = median(minutes),
            min(minutes),
            max(minutes),
            observations = length(minutes))
bikeshare_summary

Answer: Customers in Washington D.C. use the bikesharing service for the longest trip duration, namely roughly 20 minutes and 30 seconds on average. In comparison, customers in New York City use the service for 13 minutes and 15 seconds on average, while people in Chicago take the shortest trips, at 11 minutes and 30 seconds.

The median for each of the cities is less spread out. The median customer uses the service almost equally in Chicago and New York with a little less than 10 minutes, while the median customer in Washington still uses the service for roughly 12 minutes.

The difference between mean and median can be explained, due to the fact that more customers in Washington use the bike service for longer trips, as is visible by the longer tail in the distribution.

The boxplots show that the middle 50% of customers (25th to 75th percentile) in New York and Chicago use the bike sharing service for a trip duration between 5 and 17 minutes, while in Washington D.C. the middle 50% of customers use the service for between 7 and 21 minutes.

Question 2

Does age have an effect on the duration of a customer’s trip?

Create a variable age using the existing variable Birth.Year

# Create a variable age using the existing variable Birth.Year
bikeshare$age <- 2020 - bikeshare$Birth.Year
head(bikeshare)

Create a new data frame without missing values (this removes all Washington observations, because the Washington data set has no Birth.Year column)

# Create new data frame without missing values for Washington
bikeshare_age <- bikeshare%>%
  na.omit()
head(bikeshare_age)

Create scatter plot showing the correlation between age and trip duration in minutes

# Create scatter plot showing the correlation between age and trip duration in minutes
ggplot(aes(x=age, y=minutes, color=city), data=bikeshare_age) +
  geom_jitter() +
  geom_smooth(method = "lm", color = "black", linetype = "dashed") +
  ggtitle("Correlation between Customer Age and Trip Duration") +
  coord_cartesian(xlim=c(15, quantile(bikeshare_age$age, 0.9995)),
                  ylim=c(0, quantile(bikeshare_age$minutes, 0.9995))) +
  scale_x_continuous(breaks=seq(15, 100, 5)) +
  xlab("Age") + ylab("Trip Duration (in Minutes)") +
  scale_color_discrete(name = "City", labels = c("Chicago", "New York"))

Calculate actual correlation

# Calculate actual correlation
round(cor(bikeshare_age$age, bikeshare_age$minutes), 5)
[1] -0.00132

The actual correlation coefficient for the two variables trip duration (in minutes) and age is close to 0.

There also seems to be no difference between the cities of Chicago and New York. These are the only two cities for which data is available.

Question 3

What is the most common hour of the day for customers to use the bikesharing service?

Extract both the hour in which a trip started, as well as ended from Start.Time and End.Time respectively

# Extract both the hour in which a trip started, as well as ended from Start.Time and End.Time respectively
bikeshare$hour_start <- hour(bikeshare$Start.Time)
tz(): Don't know how to compute timezone for object of class factor; returning "UTC". This warning will become an error in the next major version of lubridate.
bikeshare$hour_end <- hour(bikeshare$End.Time)
tz(): Don't know how to compute timezone for object of class factor; returning "UTC". This warning will become an error in the next major version of lubridate.
head(bikeshare)

Create a histogram for the start of the trip to see when most customers start their trips during the day

# Create histogram for tje start of the trip to see when most customers start their trips during the day
q3_1 <- ggplot(aes(hour_start), data=bikeshare) +
  geom_histogram(binwidth = 1, color = "white", fill = "blue") +
  scale_x_continuous(breaks=seq(0, 23, 1)) +
  theme(axis.text=element_text(size=6)) +
  xlab("Hour of the Day") + ylab("Number of Started Trips") +
  facet_wrap(~city, nrow = 3, scales="free_y", labeller = as_labeller(c(chi = "Chicago", ny = "New York", wash = "Washington")))
q3_1

Create histogram for the end of the trip to see when most customers end their trips at any given day

# Create histogram for the end of the trip to see when most customers end their trips at any given day
q3_2 <- ggplot(aes(hour_end), data=bikeshare) +
  geom_histogram(binwidth = 1, color = "white", fill = "green") +
  scale_x_continuous(breaks=seq(0, 23, 1)) +
  theme(axis.text=element_text(size=6)) +
  xlab("Hour of the Day") + ylab("Number of Ended Trips") +
  facet_wrap(~city, nrow = 3, scales="free_y", labeller = as_labeller(c(chi = "Chicago", ny = "New York", wash = "Washington")))
q3_2

Combine both plots to see the busiest times for the use of the bikeshare service per city and hour of the day

# Combine both plots to see the busiest times for the use of the bikeshare service per city and hour of the day
grid.arrange(q3_1, q3_2, ncol = 2)

Combine both histogram in one plot to view the busiest time combined

# Combine both histogram in one plot to view the busiest time combined
ggplot(aes(hour_start), data=bikeshare) +
  geom_histogram(binwidth = 1, color = "white", fill = "blue", alpha = 0.4, position="identity") +
  geom_histogram(aes(x=hour_end), data=bikeshare, binwidth = 1, color = "white", fill = "green", alpha = 0.4, position="identity") +
  scale_x_continuous(breaks=seq(0, 23, 1)) +
  xlab("Hour of the Day") + ylab("Number of Trips") +
  facet_wrap(~city, nrow = 3, scales="free_y", labeller = as_labeller(c(chi = "Chicago", ny = "New York", wash = "Washington")))

Count the number of trips that STARTED in each hour of the day by city

# Count the number of trips that STARTED in each hour of the day by city
hours_start <- bikeshare%>%
  group_by(city, hour_start)%>%
  summarise(n_start = length(id))
hours_start

Extract the hour of the day with the maximum amount of trips STARTED

# Extract the hour of the day with the maximum amount of trips STARTED
max_start_hours <- hours_start%>%
  group_by(city)%>%
  filter(n_start==max(n_start))
max_start_hours

Count the number of trips that END in each hour of the day by city

# Count the number of trips that END in each hour of the day by city
hours_end <- bikeshare%>%
  group_by(city, hour_end)%>%
  summarise(n_end = length(id))
hours_end

Extract the hour of the day, where most of the trips END

# Extract the hour of the day, where most of the trips END
max_end_hours <- hours_end%>%
  group_by(city)%>%
  filter(n_end==max(n_end))
max_end_hours

Combine data sets containing the maximum values to provide summary statistics

# Combine data sets containing the maximum values to provide summary statistics
max_hours <- max_start_hours%>%
  full_join(max_end_hours, by = "city")
max_hours

Answer: The hour of the day, when the bikesharing service is most used, is in the afternoon for both Chicago and New York City. In both cities the busiest hour of the day is between 5pm and 6pm with roughly 5050 trips starting and ending in New York City and roughly 900 in Chicago.

In comparison, customers in Washington D.C. are more inclined to use the bikesharing service in the mornings, with 8am being prime time at almost 10,000 trips.

Question 4

What is the most common trip from start to end by city and what is the average travel time on this trip?

---
title: "Udacity Nanodegree Programming for Data Science with R"
output: html_notebook
author: Yannik Sassmann
---

#  Project 2: Explore Bikeshare Data

## Get Ready
```{r}
# Load the packages used throughout the notebook.
#
# The CSV files are read with relative paths, so the notebook expects them in
# the directory it is run from. The previous hard-coded setwd() pointed at an
# absolute path in one user's home folder and failed on any other machine, so
# it is no longer called.
#
# Packages are installed only when they are missing instead of being
# re-downloaded on every run.
required_packages <- c(
  "plyr", "dplyr", "ggpubr", "tidyr", "ggplot2", "lubridate", "gridExtra"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Attach plyr before dplyr so that dplyr's verbs (mutate, summarise, ...)
# mask plyr's versions and not the other way round.
library(plyr)
library(dplyr)
library(ggpubr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(gridExtra)
```

### Load data sets
```{r}
# Read the raw trip records of each city from its CSV file (relative paths)
ny   <- read.csv(file = "new_york_city.csv")
wash <- read.csv(file = "washington.csv")
chi  <- read.csv(file = "chicago.csv")
```

### Inspect the New York City data set
```{r}
# New York City: first rows, dimensions, column names, structure and
# per-column summary
head(ny, n = 6L)
dim(ny)
names(ny)
str(ny)
summary(ny)
```

### Inspect the Washington D.C. data set
```{r}
# Washington D.C.: first rows, dimensions, column names, structure and
# per-column summary
head(wash, n = 6L)
dim(wash)
names(wash)
str(wash)
summary(wash)
```

### Inspect the Chicago data set
```{r}
# Chicago: first rows, dimensions, column names, structure and per-column
# summary
head(chi, n = 6L)
dim(chi)
names(chi)
str(chi)
summary(chi)
```


## Preparations (Joining the Data Sets)

### Before joining the data sets, I include the variable city for each of the original data sets in order to be able to identify which city the observations belong to later on. In addition, I also want to exclude all observations with missing values.

### Build a function that adds the variable city and excludes all observations with missing values
```{r}
# Build a function that adds the variable city and excludes all observations
# with missing values.
#
# Args:
#   x:         A data frame of trip records.
#   city_name: Single string written into the new `city` column. Defaults to
#              the text of the expression passed as `x`, so `city_omit(ny)`
#              labels every row "ny". Pass it explicitly when the data frame
#              does not arrive as a bare variable name (e.g. through a pipe).
#
# Returns:
#   `x` with an added `city` column and without any row that contains an NA.
city_omit <- function(x,
                      city_name = paste(deparse(substitute(x)),
                                        collapse = " ")) {
  # Resolve the label before `x` is modified: the default is derived from the
  # unevaluated argument expression.
  force(city_name)
  stopifnot(
    is.data.frame(x),
    is.character(city_name),
    length(city_name) == 1L
  )

  # Base assignment instead of mutate(): the label can never be shadowed by a
  # data column that happens to be called `city_name`, and an empty data frame
  # is handled as well.
  x[["city"]] <- rep(city_name, nrow(x))
  na.omit(x)
}
```

### Use function to include variable city and exclude all observations with missing values for each city
```{r}
# Use function to include variable city and exclude all observations with missing values for each city
# Each call passes the bare variable name on purpose: city_omit() turns the
# argument expression itself into the label, so the resulting city values are
# "ny", "wash" and "chi". Each data frame is overwritten with its cleaned version.
ny <- city_omit(ny)
wash <- city_omit(wash)
chi <- city_omit(chi)
```

### Check results of the function
```{r}
# Verify the cleaning step: every data set should now carry the extra `city`
# column and contain fewer (or equally many) rows than before
head(ny, n = 6L)
dim(ny)
head(wash, n = 6L)
dim(wash)
head(chi, n = 6L)
dim(chi)
```

### Use plyr's rbind.fill() function to join all three data sets (Union)
```{r}
# Stack the three city data sets into one (union). plyr's rbind.fill() fills
# columns that a data set lacks with NA.
bikeshare <- plyr::rbind.fill(ny, wash, chi)
head(bikeshare, n = 6L)
tail(bikeshare, n = 6L)
```
### Build function to check if the sizes of joined data sets are equal
```{r}
# Build function to check if sizes of joined data sets are equal.
#
# Args:
#   x, y: The two sizes (e.g. row counts) to compare.
#
# Returns:
#   Invisibly, the message that was printed: "The size of the data sets is
#   equal" when `x == y` is a single TRUE, otherwise "Error" (this includes
#   NA and non-scalar comparisons).
check <- function(x, y) {
  # Plain if/else instead of ifelse(): ifelse() returned the value of the
  # print() call, so a top-level call displayed the message twice.
  status <- if (isTRUE(x == y)) {
    "The size of the data sets is equal"
  } else {
    "Error"
  }
  print(status)
  invisible(status)
}
```

### Quickly check if the size of the joined data set is equal to the sum of the individual data sets
```{r}
# Compare the row count of the combined data set with the total row count of
# the three individual data sets
x1 <- nrow(bikeshare)
y1 <- sum(nrow(ny), nrow(wash), nrow(chi))
check(x1, y1)
```

### Add id column to uniquely identify each observation
```{r}
# Add id column to uniquely identify each observation.
# seq_len() is used instead of seq.int(n): for an empty data frame seq.int(0)
# would yield c(1, 0) and the assignment would fail.
bikeshare$id <- seq_len(nrow(bikeshare))
head(bikeshare)
tail(bikeshare)
```


# Question 1
## Which city is using the bikeshare service for the longest trip duration on average? In other words, what is the average travel time for users in different cities?

### Due to the fact, that the variable Trip.Duration is declared in seconds, create a new variable that transforms Trip.Duration from seconds to minutes
```{r}
# Trip.Duration is recorded in seconds; derive the duration in minutes,
# rounded to two decimals
bikeshare[["minutes"]] <- round(bikeshare[["Trip.Duration"]] / 60, digits = 2)
head(bikeshare, n = 6L)
```

### Create plot showing the distribution of trip durations in minutes for each of the three cities, as well as their mean and median
```{r}
# Histogram of trip duration (minutes), one panel per city, with the mean
# (red) and the median (blue) marked in each panel
ggplot(data = bikeshare, mapping = aes(x = minutes)) +
  geom_histogram(binwidth = 0.5, fill = "grey26", color = "white") +
  stat_central_tendency(type = "mean", color = "red", show.legend = TRUE) +
  stat_central_tendency(type = "median", color = "blue", show.legend = TRUE) +
  facet_wrap(
    ~city,
    nrow = 3,
    scales = "free_y",
    labeller = as_labeller(
      c(chi = "Chicago", ny = "New York", wash = "Washington")
    )
  ) +
  scale_x_continuous(breaks = seq(0, 45, 5)) +
  # Zoom to the lower 95 % of all durations; the longest 5 % lie outside the
  # visible range
  coord_cartesian(xlim = c(0, quantile(bikeshare$minutes, 0.95))) +
  labs(
    title = "Distribution of Trip Duration (in Minutes) per City",
    caption = "Median = Blue \n Mean = Red",
    x = "Trip Duration in Minutes",
    y = "Frequency"
  )
```

### Create a boxplot to see how long the majority of customers use the bikesharing service
```{r}
# Horizontal boxplot of trip duration (minutes), one panel per city, to show
# the range that covers the bulk of the trips
ggplot(data = bikeshare, mapping = aes(x = minutes)) +
  geom_boxplot() +
  facet_wrap(
    ~city,
    nrow = 3,
    labeller = as_labeller(
      c(chi = "Chicago", ny = "New York", wash = "Washington")
    )
  ) +
  scale_x_continuous(breaks = seq(0, 45, 5)) +
  # Zoom to the lower 95 % of all durations; the longest 5 % lie outside the
  # visible range
  coord_cartesian(xlim = c(0, quantile(bikeshare$minutes, 0.95))) +
  labs(x = "Trip Duration in Minutes")
```


### Calculate summary statistics
```{r}
# Per-city summary statistics of the trip duration in minutes: mean, median,
# minimum, maximum and number of observations
bikeshare_summary <- bikeshare %>%
  dplyr::group_by(city) %>%
  dplyr::summarise(
    mean = mean(minutes),
    median = median(minutes),
    `min(minutes)` = min(minutes),
    `max(minutes)` = max(minutes),
    observations = dplyr::n()
  )
bikeshare_summary
```

### Answer: Customers in Washington D.C. use the bikesharing service for the longest trip duration, namely roughly 20 minutes and 30 seconds on average. In comparison, customers in New York City use the service for 13 minutes and 15 seconds on average, while people in Chicago take the shortest trips, at 11 minutes and 30 seconds.

### The median for each of the cities is less spread out. The median customer uses the service almost equally in Chicago and New York with a little less than 10 minutes, while the median customer in Washington still uses the service for roughly 12 minutes.

### The difference between mean and median can be explained, due to the fact that more customers in Washington use the bike service for longer trips, as is visible by the longer tail in the distribution.

### The boxplots show that the middle 50% of customers (25th to 75th percentile) in New York and Chicago use the bike sharing service for a trip duration between 5 and 17 minutes, while in Washington D.C. the middle 50% of customers use the service for between 7 and 21 minutes.



# Question 2
## Does age have an effect on the duration of a customer's trip?

### Create a variable age using the existing variable Birth.Year
```{r}
# Create a variable age using the existing variable Birth.Year.
# Age is measured in the year the trip took place (taken from Start.Time)
# rather than against a fixed calendar year, so it reflects how old the rider
# was at the time of the ride. as.character() makes the parsing work whether
# read.csv() returned Start.Time as factor or as character.
# Rows without Birth.Year (or with an unparseable Start.Time) get NA.
bikeshare$age <-
  year(ymd_hms(as.character(bikeshare$Start.Time))) - bikeshare$Birth.Year
head(bikeshare)
```

### Create a new data frame without missing values (this removes all Washington observations, because the Washington data set has no Birth.Year column)
```{r}
# Keep only complete rows. Washington rows carry NA in the columns that the
# Washington file lacks (filled in when the data sets were stacked), so they
# are all removed here.
bikeshare_age <- na.omit(bikeshare)
head(bikeshare_age, n = 6L)
```

### Create scatter plot showing the correlation between age and trip duration in minutes
```{r}
# Jittered scatter plot of rider age against trip duration (minutes), coloured
# by city, with a dashed black linear-regression line
ggplot(
  data = bikeshare_age,
  mapping = aes(x = age, y = minutes, color = city)
) +
  geom_jitter() +
  geom_smooth(method = "lm", linetype = "dashed", color = "black") +
  scale_x_continuous(breaks = seq(15, 100, 5)) +
  scale_color_discrete(name = "City", labels = c("Chicago", "New York")) +
  # Zoom only: the most extreme 0.05 % of ages and durations lie outside the
  # visible range
  coord_cartesian(
    xlim = c(15, quantile(bikeshare_age$age, 0.9995)),
    ylim = c(0, quantile(bikeshare_age$minutes, 0.9995))
  ) +
  labs(
    title = "Correlation between Customer Age and Trip Duration",
    x = "Age",
    y = "Trip Duration (in Minutes)"
  )
```

### Calculate actual correlation
```{r}
# Correlation coefficient between age and trip duration (minutes), rounded to
# five decimals
with(bikeshare_age, round(cor(age, minutes), digits = 5))
```

### Answer: There seems to be no correlation between the age of customers and their trip duration. Contrary to popular belief, old customers take as long trips as young customers do. Hence trip duration doesn't decrease with age.

### The actual correlation coefficient for the two variables trip duration (in minutes) and age is close to 0.

### There also seems to be no difference between the cities of Chicago and New York. These are the only two cities for which data is available.



# Question 3
## What is the most common hour of the day for customers to use the bikesharing service?

### Extract both the hour in which a trip started, as well as ended from Start.Time and End.Time respectively
```{r}
# Extract both the hour in which a trip started, as well as ended from
# Start.Time and End.Time respectively.
# The timestamps are parsed explicitly before hour() is applied: calling
# hour() directly on a factor column makes lubridate warn that it cannot
# compute a time zone for class factor, and that warning is announced to
# become an error. as.character() keeps this working for factor and character
# columns alike; ymd_hms() parses them as UTC.
bikeshare$hour_start <- hour(ymd_hms(as.character(bikeshare$Start.Time)))
bikeshare$hour_end <- hour(ymd_hms(as.character(bikeshare$End.Time)))
head(bikeshare)
```

### Create a histogram for the start of the trip to see when most customers start their trips during the day
```{r}
# Histogram of the hour in which trips start, one panel per city, to see when
# most customers begin their trips during the day
q3_1 <- ggplot(data = bikeshare, mapping = aes(x = hour_start)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "white") +
  facet_wrap(
    ~city,
    nrow = 3,
    scales = "free_y",
    labeller = as_labeller(
      c(chi = "Chicago", ny = "New York", wash = "Washington")
    )
  ) +
  scale_x_continuous(breaks = seq(0, 23, 1)) +
  labs(x = "Hour of the Day", y = "Number of Started Trips") +
  theme(axis.text = element_text(size = 6))
q3_1
```

### Create histogram for the end of the trip to see when most customers end their trips at any given day
```{r}
# Histogram of the hour in which trips end, one panel per city, to see when
# most customers finish their trips during the day
q3_2 <- ggplot(data = bikeshare, mapping = aes(x = hour_end)) +
  geom_histogram(binwidth = 1, fill = "green", color = "white") +
  facet_wrap(
    ~city,
    nrow = 3,
    scales = "free_y",
    labeller = as_labeller(
      c(chi = "Chicago", ny = "New York", wash = "Washington")
    )
  ) +
  scale_x_continuous(breaks = seq(0, 23, 1)) +
  labs(x = "Hour of the Day", y = "Number of Ended Trips") +
  theme(axis.text = element_text(size = 6))
q3_2
```

### Combine both plots to see the busiest times for the use of the bikeshare service per city and hour of the day
```{r}
# Show the start-hour and end-hour histograms side by side to compare the
# busiest times per city
gridExtra::grid.arrange(q3_1, q3_2, ncol = 2)
```

### Combine both histogram in one plot to view the busiest time combined
```{r}
# Overlay the start-hour (blue) and end-hour (green) histograms in one plot,
# one panel per city; both layers are semi-transparent and drawn in place
ggplot(data = bikeshare, mapping = aes(x = hour_start)) +
  geom_histogram(
    binwidth = 1,
    position = "identity",
    alpha = 0.4,
    fill = "blue",
    color = "white"
  ) +
  geom_histogram(
    mapping = aes(x = hour_end),
    binwidth = 1,
    position = "identity",
    alpha = 0.4,
    fill = "green",
    color = "white"
  ) +
  facet_wrap(
    ~city,
    nrow = 3,
    scales = "free_y",
    labeller = as_labeller(
      c(chi = "Chicago", ny = "New York", wash = "Washington")
    )
  ) +
  scale_x_continuous(breaks = seq(0, 23, 1)) +
  labs(x = "Hour of the Day", y = "Number of Trips")
```

### Count the number of trips that STARTED in each hour of the day by city
```{r}
# Count the number of trips that STARTED in each hour of the day by city.
# n() tallies the rows of each (city, hour_start) group directly, so the count
# does not depend on a specific column such as `id` being present (a missing
# `id` column would otherwise resolve to a function and yield a count of 1).
hours_start <- bikeshare %>%
  group_by(city, hour_start) %>%
  summarise(n_start = n())
hours_start
```

### Extract the hour of the day with the maximum amount of trips STARTED
```{r}
# For each city keep the hour(s) with the highest number of STARTED trips
# (ties within a city are all retained)
max_start_hours <- filter(
  group_by(hours_start, city),
  n_start == max(n_start)
)
max_start_hours
```

### Count the number of trips that END in each hour of the day by city
```{r}
# Count the number of trips that END in each hour of the day by city.
# n() tallies the rows of each (city, hour_end) group directly, so the count
# does not depend on a specific column such as `id` being present (a missing
# `id` column would otherwise resolve to a function and yield a count of 1).
hours_end <- bikeshare %>%
  group_by(city, hour_end) %>%
  summarise(n_end = n())
hours_end
```

### Extract the hour of the day, where most of the trips END
```{r}
# For each city keep the hour(s) in which the highest number of trips END
# (ties within a city are all retained)
max_end_hours <- filter(
  group_by(hours_end, city),
  n_end == max(n_end)
)
max_end_hours
```

### Combine data sets containing the maximum values to provide summary statistics
```{r}
# Join the per-city peak start hour and peak end hour into one summary table,
# matching rows on city
max_hours <- full_join(max_start_hours, max_end_hours, by = "city")
max_hours
```

### Answer: The hour of the day, when the bikesharing service is most used, is in the afternoon for both Chicago and New York City. In both cities the busiest hour of the day is between 5pm and 6pm with roughly 5050 trips starting and ending in New York City and roughly 900 in Chicago.

### In comparison, customers in Washington D.C. are more inclined to use the bikesharing service in the mornings, with 8am being prime time at almost 10,000 trips.



# Question 4
## What is the most common trip from start to end by city and what is the average travel time on this trip?

## Q4.1 - First Part: Find most popular starting station per city

### Sum up all number of times a station has been the starting point of a trip in each city and renaming the newly created variable
```{r}
# Count how often each station was the starting point of a trip in each city.
# ddply() names the row-count column V1; it is renamed to n right away.
count_start <- rename(
  ddply(bikeshare, .(Start.Station, city), nrow),
  n = V1
)
count_start
```

### Convert the variable n (number of trips that started at a station) to numeric variable for easier handling
```{r}
# Convert the variable n (number of trips that started at a station) to a
# numeric (double) variable for easier handling. n comes from nrow() and is
# already an integer count, so a single as.numeric() is sufficient; no
# round-trip through character is needed.
count_start$n <- as.numeric(count_start$n)
str(count_start)
```

### Convert the variable Start.Station from factor to character for easier handling
```{r}
# Store Start.Station as plain character strings (instead of a factor) for
# easier handling
count_start[["Start.Station"]] <- as.character(count_start[["Start.Station"]])
str(count_start)
```

### Get the name of the stations, where most of the trips have started for each of the cities
```{r}
# For each city keep the station(s) with the highest number of started trips,
# then drop the grouping so later steps work on a plain table
top_start_stations <- ungroup(
  filter(group_by(count_start, city), n == max(n))
)
top_start_stations
```

### Reorder data frame in descending order
```{r}
# Turn city into a factor with a fixed level order (wash, ny, chi) and sort
# the rows by number of trips, largest first
top_start_stations <- top_start_stations %>%
  mutate(city = factor(city, levels = c("wash", "ny", "chi"))) %>%
  arrange(desc(n))
top_start_stations
```

### Create plot with the top start stations per city
```{r}
# Create plot with the top start stations per city: one bar per city, with the
# station name printed inside the bar.
# The x-axis labels are given as a NAMED vector keyed by city code, so each
# label is matched to its city by name rather than by factor level position
# (positional labels would be silently wrong if the level order changed).
ggplot(aes(x = city, y = n, label = Start.Station), data = top_start_stations) +
  geom_col(color = "black", fill = "blue") +
  geom_text(size = 3, color = "white", position = position_stack(vjust = 0.7)) +
  ggtitle("Most Popular Start Station By City") +
  scale_x_discrete(
    name = "City",
    labels = c(wash = "Washington", ny = "New York City", chi = "Chicago")
  ) +
  ylab("Number of trips started at station")
```


## Q4.2 - Second Part: Find most popular end stations per city, given trips have started at the most popular starting stations

### Subset original bikeshare data set to only include observations from the most popular starting stations per city
```{r}
# Subset original bikeshare data set to only include observations from the
# most popular starting stations per city.
# The station names are taken from top_start_stations instead of being typed
# in by hand, so the subset stays correct if the underlying data changes.
count_end <- bikeshare %>%
  filter(Start.Station %in% top_start_stations$Start.Station)
count_end
```

### Quickly check if number of started trips is equal in both the top_start_stations data set and the newly created count_end data set by using the function built in the preparation part of the exercise
```{r}
# Sanity check: the number of rows in count_end should equal the total number
# of trips started at the top start stations (sum of n in top_start_stations).
# The comparison uses the check() helper built in the preparation part of the
# exercise.
x2 <- dim(count_end)[1]
y2 <- sum(top_start_stations[["n"]])
check(x2, y2)
```

### Sum up the number of times a station has been the end point of a trip, given that one of the top starting stations was the starting point, for each of the cities
```{r}
# Count how often each station was the end point of a trip that began at one
# of the top starting stations, per city and start station.
# ddply() names the row-count column V1; it is renamed to n right away.
count_end <- rename(
  ddply(count_end, .(End.Station, city, Start.Station), nrow),
  n = V1
)
count_end
```

### Convert the variable n (number of trips that ended at a station) to numeric variable for easier handling
```{r}
# Convert the variable n (number of trips that ended at a station) to a
# numeric (double) variable for easier handling. n comes from nrow() and is
# already an integer count, so a single as.numeric() is sufficient; no
# round-trip through character is needed.
count_end$n <- as.numeric(count_end$n)
str(count_end)
```

### Convert the variable End.Station from factor to character for easier handling
```{r}
# Store End.Station as plain character strings (instead of a factor) for
# easier handling
count_end[["End.Station"]] <- as.character(count_end[["End.Station"]])
str(count_end)
```

### Get the counts of the stations, where most of the trips ended, given they started at the most popular starting station for each city
```{r}
# For every (city, start station) pair, report the highest trip count reached
# by any single end station
pop_stations_end <- summarise(
  group_by(count_end, city, Start.Station),
  max = max(n)
)
pop_stations_end
```

### Get the name of the stations, where most of the trips have ended, given they started at the most popular starting station for each city
```{r}
# Get the name of the stations, where most of the trips have ended, given they
# started at the most popular starting station for each city.
# Each city's rows are selected first and then reduced to the row(s) whose
# trip count equals that city's maximum, so the maximum is derived from the
# data instead of being typed in as a literal number.
ny_end_station <- count_end %>%
  filter(city == "ny") %>%
  filter(n == max(n))
ny_end_station

wash_end_station <- count_end %>%
  filter(city == "wash") %>%
  filter(n == max(n))
wash_end_station

chi_end_station <- count_end %>%
  filter(city == "chi") %>%
  filter(n == max(n))
chi_end_station
```

### Combine individual results from each city to a data frame showing the most popular trips per city
```{r}
# Stack the per-city results into one data frame of the most popular trips,
# fix the city factor level order (wash, ny, chi), sort by trip count
# (largest first) and give the columns presentation-friendly names
top_trips <- rbind.fill(ny_end_station, wash_end_station, chi_end_station) %>%
  mutate(city = factor(city, levels = c("wash", "ny", "chi"))) %>%
  arrange(desc(n)) %>%
  rename(City = city, NumberOfTrips = n)
top_trips
```

### Rearrange column order to increase readability of data frame
```{r}
# Put the columns into a reader-friendly order: city, start, end, trip count
col_order <- c("City", "Start.Station", "End.Station", "NumberOfTrips")
top_trips <- top_trips[col_order]
top_trips
```

### Combine Start.Station and End.Station to Trip variable
```{r}
# Add a Trip column of the form "<start station> - <end station>"
top_trips <- top_trips %>%
  transform(Trip = paste(Start.Station, End.Station, sep = " - "))
top_trips
```

### Create plot showing the most popular trips per city
```{r}
# Create plot showing the most popular trips per city: one bar per city with
# the start station, a dash and the end station printed inside the bar.
# - Text labels are mapped through aes() so each label is taken from the same
#   row as its bar, rather than from an external vector that merely happens to
#   be in the same order.
# - The x-axis labels are a NAMED vector keyed by city code, so each label is
#   matched to its city by name rather than by factor level position.
ggplot(aes(x = City, y = NumberOfTrips, color = Trip), data = top_trips) +
  geom_col(color = "black", fill = "blue") +
  ggtitle("Most Popular Trips By City") +
  geom_text(
    aes(label = Start.Station),
    size = 3, color = "white", position = position_stack(vjust = 0.7)
  ) +
  geom_text(
    label = "-",
    size = 3, color = "white", position = position_stack(vjust = 0.6)
  ) +
  geom_text(
    aes(label = End.Station),
    size = 3, color = "white", position = position_stack(vjust = 0.4)
  ) +
  scale_y_continuous(breaks = seq(0, 120, 10)) +
  scale_x_discrete(
    name = "City",
    labels = c(wash = "Washington", ny = "New York City", chi = "Chicago")
  ) +
  ylab("Number of Trips")
```

### Provide summary statistics
```{r}
# Provide summary statistics
top_trips
```

### Answer: The most popular trip for Washington D.C. is the trip from Columbus Circle / Union Station to 8th & F St NE, which was taken 107 times during the observation period. The most popular trip in New York City was between Pershing Square North and W 33 St & 7 Ave with 24 trips and in Chicago the most popular trip was from Clinton St & Washington Blvd to Michigan Ave & Washington St with 11 trips in total.






